1   /*
2    * dk.brics.automaton
3    * 
4    * Copyright (c) 2001-2009 Anders Moeller
5    * All rights reserved.
6    * 
7    * Redistribution and use in source and binary forms, with or without
8    * modification, are permitted provided that the following conditions
9    * are met:
10   * 1. Redistributions of source code must retain the above copyright
11   *    notice, this list of conditions and the following disclaimer.
12   * 2. Redistributions in binary form must reproduce the above copyright
13   *    notice, this list of conditions and the following disclaimer in the
14   *    documentation and/or other materials provided with the distribution.
15   * 3. The name of the author may not be used to endorse or promote products
16   *    derived from this software without specific prior written permission.
17   * 
18   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19   * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21   * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24   * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25   * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27   * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28   */
29  
30  package org.apache.lucene.util.automaton;
31  
32  import org.apache.lucene.util.ArrayUtil;
33  import org.apache.lucene.util.BytesRef;
34  import org.apache.lucene.util.BytesRefBuilder;
35  import org.apache.lucene.util.IntsRef;
36  import org.apache.lucene.util.IntsRefBuilder;
37  import org.apache.lucene.util.RamUsageEstimator;
38  
39  import java.util.ArrayList;
40  import java.util.Arrays;
41  import java.util.BitSet;
42  import java.util.Collection;
43  import java.util.HashMap;
44  import java.util.HashSet;
45  import java.util.LinkedList;
46  import java.util.List;
47  import java.util.Map;
48  import java.util.Set;
49  
50  /**
51   * Automata operations.
52   * 
53   * @lucene.experimental
54   */
55  final public class Operations {
56    /**
57     * Default maximum number of states that {@link Operations#determinize} should create.
58     */
59    public static final int DEFAULT_MAX_DETERMINIZED_STATES = 10000;
60  
61    private Operations() {}
62  
63    /**
64     * Returns an automaton that accepts the concatenation of the languages of the
65     * given automata.
66     * <p>
67     * Complexity: linear in total number of states.
68     */
69    static public Automaton concatenate(Automaton a1, Automaton a2) {
70      return concatenate(Arrays.asList(a1, a2));
71    }
72  
73    /**
74     * Returns an automaton that accepts the concatenation of the languages of the
75     * given automata.
76     * <p>
77     * Complexity: linear in total number of states.
78     */
79    static public Automaton concatenate(List<Automaton> l) {
80      Automaton result = new Automaton();
81  
82      // First pass: create all states
83      for(Automaton a : l) {
84        if (a.getNumStates() == 0) {
85          result.finishState();
86          return result;
87        }
88        int numStates = a.getNumStates();
89        for(int s=0;s<numStates;s++) {
90          result.createState();
91        }
92      }
93  
94      // Second pass: add transitions, carefully linking accept
95      // states of A to init state of next A:
96      int stateOffset = 0;
97      Transition t = new Transition();
98      for(int i=0;i<l.size();i++) {
99        Automaton a = l.get(i);
100       int numStates = a.getNumStates();
101 
102       Automaton nextA = (i == l.size()-1) ? null : l.get(i+1);
103 
104       for(int s=0;s<numStates;s++) {
105         int numTransitions = a.initTransition(s, t);
106         for(int j=0;j<numTransitions;j++) {
107           a.getNextTransition(t);
108           result.addTransition(stateOffset + s, stateOffset + t.dest, t.min, t.max);
109         }
110 
111         if (a.isAccept(s)) {
112           Automaton followA = nextA;
113           int followOffset = stateOffset;
114           int upto = i+1;
115           while (true) {
116             if (followA != null) {
117               // Adds a "virtual" epsilon transition:
118               numTransitions = followA.initTransition(0, t);
119               for(int j=0;j<numTransitions;j++) {
120                 followA.getNextTransition(t);
121                 result.addTransition(stateOffset + s, followOffset + numStates + t.dest, t.min, t.max);
122               }
123               if (followA.isAccept(0)) {
124                 // Keep chaining if followA accepts empty string
125                 followOffset += followA.getNumStates();
126                 followA = (upto == l.size()-1) ? null : l.get(upto+1);
127                 upto++;
128               } else {
129                 break;
130               }
131             } else {
132               result.setAccept(stateOffset + s, true);
133               break;
134             }
135           }
136         }
137       }
138 
139       stateOffset += numStates;
140     }
141 
142     if (result.getNumStates() == 0) {
143       result.createState();
144     }
145 
146     result.finishState();
147 
148     return result;
149   }
150 
151   /**
152    * Returns an automaton that accepts the union of the empty string and the
153    * language of the given automaton.  This may create a dead state.
154    * <p>
155    * Complexity: linear in number of states.
156    */
157   static public Automaton optional(Automaton a) {
158     Automaton result = new Automaton();
159     result.createState();
160     result.setAccept(0, true);
161     if (a.getNumStates() > 0) {
162       result.copy(a);
163       result.addEpsilon(0, 1);
164     }
165     result.finishState();
166     return result;
167   }
168   
169   /**
170    * Returns an automaton that accepts the Kleene star (zero or more
171    * concatenated repetitions) of the language of the given automaton. Never
172    * modifies the input automaton language.
173    * <p>
174    * Complexity: linear in number of states.
175    */
176   static public Automaton repeat(Automaton a) {
177     if (a.getNumStates() == 0) {
178       // Repeating the empty automata will still only accept the empty automata.
179       return a;
180     }
181     Automaton.Builder builder = new Automaton.Builder();
182     builder.createState();
183     builder.setAccept(0, true);
184     builder.copy(a);
185 
186     Transition t = new Transition();
187     int count = a.initTransition(0, t);
188     for(int i=0;i<count;i++) {
189       a.getNextTransition(t);
190       builder.addTransition(0, t.dest+1, t.min, t.max);
191     }
192 
193     int numStates = a.getNumStates();
194     for(int s=0;s<numStates;s++) {
195       if (a.isAccept(s)) {
196         count = a.initTransition(0, t);
197         for(int i=0;i<count;i++) {
198           a.getNextTransition(t);
199           builder.addTransition(s+1, t.dest+1, t.min, t.max);
200         }
201       }
202     }
203 
204     return builder.finish();
205   }
206 
207   /**
208    * Returns an automaton that accepts <code>min</code> or more concatenated
209    * repetitions of the language of the given automaton.
210    * <p>
211    * Complexity: linear in number of states and in <code>min</code>.
212    */
213   static public Automaton repeat(Automaton a, int count) {
214     if (count == 0) {
215       return repeat(a);
216     }
217     List<Automaton> as = new ArrayList<>();
218     while (count-- > 0) {
219       as.add(a);
220     }
221     as.add(repeat(a));
222     return concatenate(as);
223   }
224   
225   /**
226    * Returns an automaton that accepts between <code>min</code> and
227    * <code>max</code> (including both) concatenated repetitions of the language
228    * of the given automaton.
229    * <p>
230    * Complexity: linear in number of states and in <code>min</code> and
231    * <code>max</code>.
232    */
233   static public Automaton repeat(Automaton a, int min, int max) {
234     if (min > max) {
235       return Automata.makeEmpty();
236     }
237 
238     Automaton b;
239     if (min == 0) {
240       b = Automata.makeEmptyString();
241     } else if (min == 1) {
242       b = new Automaton();
243       b.copy(a);
244     } else {
245       List<Automaton> as = new ArrayList<>();
246       for(int i=0;i<min;i++) {
247         as.add(a);
248       }
249       b = concatenate(as);
250     }
251 
252     Set<Integer> prevAcceptStates = toSet(b, 0);
253     Automaton.Builder builder = new Automaton.Builder();
254     builder.copy(b);
255     for(int i=min;i<max;i++) {
256       int numStates = builder.getNumStates();
257       builder.copy(a);
258       for(int s : prevAcceptStates) {
259         builder.addEpsilon(s, numStates);
260       }
261       prevAcceptStates = toSet(a, numStates);
262     }
263 
264     return builder.finish();
265   }
266 
267   private static Set<Integer> toSet(Automaton a, int offset) {
268     int numStates = a.getNumStates();
269     BitSet isAccept = a.getAcceptStates();
270     Set<Integer> result = new HashSet<Integer>();
271     int upto = 0;
272     while (upto < numStates && (upto = isAccept.nextSetBit(upto)) != -1) {
273       result.add(offset+upto);
274       upto++;
275     }
276 
277     return result;
278   }
279   
280   /**
281    * Returns a (deterministic) automaton that accepts the complement of the
282    * language of the given automaton.
283    * <p>
284    * Complexity: linear in number of states if already deterministic and
285    *  exponential otherwise.
286    * @param maxDeterminizedStates maximum number of states determinizing the
287    *  automaton can result in.  Set higher to allow more complex queries and
288    *  lower to prevent memory exhaustion.
289    */
290   static public Automaton complement(Automaton a, int maxDeterminizedStates) {
291     a = totalize(determinize(a, maxDeterminizedStates));
292     int numStates = a.getNumStates();
293     for (int p=0;p<numStates;p++) {
294       a.setAccept(p, !a.isAccept(p));
295     }
296     return removeDeadStates(a);
297   }
298   
299   /**
300    * Returns a (deterministic) automaton that accepts the intersection of the
301    * language of <code>a1</code> and the complement of the language of
302    * <code>a2</code>. As a side-effect, the automata may be determinized, if not
303    * already deterministic.
304    * <p>
305    * Complexity: quadratic in number of states if a2 already deterministic and
306    *  exponential in number of a2's states otherwise.
307    */
308   static public Automaton minus(Automaton a1, Automaton a2, int maxDeterminizedStates) {
309     if (Operations.isEmpty(a1) || a1 == a2) {
310       return Automata.makeEmpty();
311     }
312     if (Operations.isEmpty(a2)) {
313       return a1;
314     }
315     return intersection(a1, complement(a2, maxDeterminizedStates));
316   }
317   
318   /**
319    * Returns an automaton that accepts the intersection of the languages of the
320    * given automata. Never modifies the input automata languages.
321    * <p>
322    * Complexity: quadratic in number of states.
323    */
324   static public Automaton intersection(Automaton a1, Automaton a2) {
325     if (a1 == a2) {
326       return a1;
327     }
328     if (a1.getNumStates() == 0) {
329       return a1;
330     }
331     if (a2.getNumStates() == 0) {
332       return a2;
333     }
334     Transition[][] transitions1 = a1.getSortedTransitions();
335     Transition[][] transitions2 = a2.getSortedTransitions();
336     Automaton c = new Automaton();
337     c.createState();
338     LinkedList<StatePair> worklist = new LinkedList<>();
339     HashMap<StatePair,StatePair> newstates = new HashMap<>();
340     StatePair p = new StatePair(0, 0, 0);
341     worklist.add(p);
342     newstates.put(p, p);
343     while (worklist.size() > 0) {
344       p = worklist.removeFirst();
345       c.setAccept(p.s, a1.isAccept(p.s1) && a2.isAccept(p.s2));
346       Transition[] t1 = transitions1[p.s1];
347       Transition[] t2 = transitions2[p.s2];
348       for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
349         while (b2 < t2.length && t2[b2].max < t1[n1].min)
350           b2++;
351         for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++)
352           if (t2[n2].max >= t1[n1].min) {
353             StatePair q = new StatePair(t1[n1].dest, t2[n2].dest);
354             StatePair r = newstates.get(q);
355             if (r == null) {
356               q.s = c.createState();
357               worklist.add(q);
358               newstates.put(q, q);
359               r = q;
360             }
361             int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
362             int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
363             c.addTransition(p.s, r.s, min, max);
364           }
365       }
366     }
367     c.finishState();
368 
369     return removeDeadStates(c);
370   }
371 
372   /** Returns true if these two automata accept exactly the
373    *  same language.  This is a costly computation!  Note
374    *  also that a1 and a2 will be determinized as a side
375    *  effect.  Both automata must be determinized and have
376    *  no dead states! */
377   public static boolean sameLanguage(Automaton a1, Automaton a2) {
378     if (a1 == a2) {
379       return true;
380     }
381     return subsetOf(a2, a1) && subsetOf(a1, a2);
382   }
383 
384   // TODO: move to test-framework?
385   /** Returns true if this automaton has any states that cannot
386    *  be reached from the initial state or cannot reach an accept state.
387    *  Cost is O(numTransitions+numStates). */
388   public static boolean hasDeadStates(Automaton a) {
389     BitSet liveStates = getLiveStates(a);
390     int numLive = liveStates.cardinality();
391     int numStates = a.getNumStates();
392     assert numLive <= numStates: "numLive=" + numLive + " numStates=" + numStates + " " + liveStates;
393     return numLive < numStates;
394   }
395 
396   // TODO: move to test-framework?
397   /** Returns true if there are dead states reachable from an initial state. */
398   public static boolean hasDeadStatesFromInitial(Automaton a) {
399     BitSet reachableFromInitial = getLiveStatesFromInitial(a);
400     BitSet reachableFromAccept = getLiveStatesToAccept(a);
401     reachableFromInitial.andNot(reachableFromAccept);
402     return reachableFromInitial.isEmpty() == false;
403   }
404 
405   // TODO: move to test-framework?
406   /** Returns true if there are dead states that reach an accept state. */
407   public static boolean hasDeadStatesToAccept(Automaton a) {
408     BitSet reachableFromInitial = getLiveStatesFromInitial(a);
409     BitSet reachableFromAccept = getLiveStatesToAccept(a);
410     reachableFromAccept.andNot(reachableFromInitial);
411     return reachableFromAccept.isEmpty() == false;
412   }
413 
414   /**
415    * Returns true if the language of <code>a1</code> is a subset of the language
416    * of <code>a2</code>. Both automata must be determinized and must have no dead
417    * states.
418    * <p>
419    * Complexity: quadratic in number of states.
420    */
421   public static boolean subsetOf(Automaton a1, Automaton a2) {
422     if (a1.isDeterministic() == false) {
423       throw new IllegalArgumentException("a1 must be deterministic");
424     }
425     if (a2.isDeterministic() == false) {
426       throw new IllegalArgumentException("a2 must be deterministic");
427     }
428     assert hasDeadStatesFromInitial(a1) == false;
429     assert hasDeadStatesFromInitial(a2) == false;
430     if (a1.getNumStates() == 0) {
431       // Empty language is alwyas a subset of any other language
432       return true;
433     } else if (a2.getNumStates() == 0) {
434       return isEmpty(a1);
435     }
436 
437     // TODO: cutover to iterators instead
438     Transition[][] transitions1 = a1.getSortedTransitions();
439     Transition[][] transitions2 = a2.getSortedTransitions();
440     LinkedList<StatePair> worklist = new LinkedList<>();
441     HashSet<StatePair> visited = new HashSet<>();
442     StatePair p = new StatePair(0, 0);
443     worklist.add(p);
444     visited.add(p);
445     while (worklist.size() > 0) {
446       p = worklist.removeFirst();
447       if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) {
448         return false;
449       }
450       Transition[] t1 = transitions1[p.s1];
451       Transition[] t2 = transitions2[p.s2];
452       for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
453         while (b2 < t2.length && t2[b2].max < t1[n1].min) {
454           b2++;
455         }
456         int min1 = t1[n1].min, max1 = t1[n1].max;
457 
458         for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
459           if (t2[n2].min > min1) {
460             return false;
461           }
462           if (t2[n2].max < Character.MAX_CODE_POINT) {
463             min1 = t2[n2].max + 1;
464           } else {
465             min1 = Character.MAX_CODE_POINT;
466             max1 = Character.MIN_CODE_POINT;
467           }
468           StatePair q = new StatePair(t1[n1].dest, t2[n2].dest);
469           if (!visited.contains(q)) {
470             worklist.add(q);
471             visited.add(q);
472           }
473         }
474         if (min1 <= max1) {
475           return false;
476         }
477       }
478     }
479     return true;
480   }
481 
482   /**
483    * Returns an automaton that accepts the union of the languages of the given
484    * automata.
485    * <p>
486    * Complexity: linear in number of states.
487    */
488   public static Automaton union(Automaton a1, Automaton a2) {
489     return union(Arrays.asList(a1, a2));
490   }
491 
492   /**
493    * Returns an automaton that accepts the union of the languages of the given
494    * automata.
495    * <p>
496    * Complexity: linear in number of states.
497    */
498   public static Automaton union(Collection<Automaton> l) {
499     Automaton result = new Automaton();
500 
501     // Create initial state:
502     result.createState();
503 
504     // Copy over all automata
505     for(Automaton a : l) {
506       result.copy(a);
507     }
508     
509     // Add epsilon transition from new initial state
510     int stateOffset = 1;
511     for(Automaton a : l) {
512       if (a.getNumStates() == 0) {
513         continue;
514       }
515       result.addEpsilon(0, stateOffset);
516       stateOffset += a.getNumStates();
517     }
518 
519     result.finishState();
520 
521     return removeDeadStates(result);
522   }
523 
524   // Simple custom ArrayList<Transition>
525   private final static class TransitionList {
526     // dest, min, max
527     int[] transitions = new int[3];
528     int next;
529 
530     public void add(Transition t) {
531       if (transitions.length < next+3) {
532         transitions = ArrayUtil.grow(transitions, next+3);
533       }
534       transitions[next] = t.dest;
535       transitions[next+1] = t.min;
536       transitions[next+2] = t.max;
537       next += 3;
538     }
539   }
540 
541   // Holds all transitions that start on this int point, or
542   // end at this point-1
543   private final static class PointTransitions implements Comparable<PointTransitions> {
544     int point;
545     final TransitionList ends = new TransitionList();
546     final TransitionList starts = new TransitionList();
547 
548     @Override
549     public int compareTo(PointTransitions other) {
550       return point - other.point;
551     }
552 
553     public void reset(int point) {
554       this.point = point;
555       ends.next = 0;
556       starts.next = 0;
557     }
558 
559     @Override
560     public boolean equals(Object other) {
561       return ((PointTransitions) other).point == point;
562     }
563 
564     @Override
565     public int hashCode() {
566       return point;
567     }
568   }
569 
570   private final static class PointTransitionSet {
571     int count;
572     PointTransitions[] points = new PointTransitions[5];
573 
574     private final static int HASHMAP_CUTOVER = 30;
575     private final HashMap<Integer,PointTransitions> map = new HashMap<>();
576     private boolean useHash = false;
577 
578     private PointTransitions next(int point) {
579       // 1st time we are seeing this point
580       if (count == points.length) {
581         final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
582         System.arraycopy(points, 0, newArray, 0, count);
583         points = newArray;
584       }
585       PointTransitions points0 = points[count];
586       if (points0 == null) {
587         points0 = points[count] = new PointTransitions();
588       }
589       points0.reset(point);
590       count++;
591       return points0;
592     }
593 
594     private PointTransitions find(int point) {
595       if (useHash) {
596         final Integer pi = point;
597         PointTransitions p = map.get(pi);
598         if (p == null) {
599           p = next(point);
600           map.put(pi, p);
601         }
602         return p;
603       } else {
604         for(int i=0;i<count;i++) {
605           if (points[i].point == point) {
606             return points[i];
607           }
608         }
609 
610         final PointTransitions p = next(point);
611         if (count == HASHMAP_CUTOVER) {
612           // switch to HashMap on the fly
613           assert map.size() == 0;
614           for(int i=0;i<count;i++) {
615             map.put(points[i].point, points[i]);
616           }
617           useHash = true;
618         }
619         return p;
620       }
621     }
622 
623     public void reset() {
624       if (useHash) {
625         map.clear();
626         useHash = false;
627       }
628       count = 0;
629     }
630 
631     public void sort() {
632       // Tim sort performs well on already sorted arrays:
633       if (count > 1) ArrayUtil.timSort(points, 0, count);
634     }
635 
636     public void add(Transition t) {
637       find(t.min).starts.add(t);
638       find(1+t.max).ends.add(t);
639     }
640 
641     @Override
642     public String toString() {
643       StringBuilder s = new StringBuilder();
644       for(int i=0;i<count;i++) {
645         if (i > 0) {
646           s.append(' ');
647         }
648         s.append(points[i].point).append(':').append(points[i].starts.next/3).append(',').append(points[i].ends.next/3);
649       }
650       return s.toString();
651     }
652   }
653 
654   /**
655    * Determinizes the given automaton.
656    * <p>
657    * Worst case complexity: exponential in number of states.
658    * @param maxDeterminizedStates Maximum number of states created when
659    *   determinizing.  Higher numbers allow this operation to consume more
660    *   memory but allow more complex automatons.  Use
661    *   DEFAULT_MAX_DETERMINIZED_STATES as a decent default if you don't know
662    *   how many to allow.
663    * @throws TooComplexToDeterminizeException if determinizing a creates an
664    *   automaton with more than maxDeterminizedStates
665    */
666   public static Automaton determinize(Automaton a, int maxDeterminizedStates) {
667     if (a.isDeterministic()) {
668       // Already determinized
669       return a;
670     }
671     if (a.getNumStates() <= 1) {
672       // Already determinized
673       return a;
674     }
675 
676     // subset construction
677     Automaton.Builder b = new Automaton.Builder();
678 
679     //System.out.println("DET:");
680     //a.writeDot("/l/la/lucene/core/detin.dot");
681 
682     SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(0, 0);
683 
684     // Create state 0:
685     b.createState();
686 
687     LinkedList<SortedIntSet.FrozenIntSet> worklist = new LinkedList<>();
688     Map<SortedIntSet.FrozenIntSet,Integer> newstate = new HashMap<>();
689 
690     worklist.add(initialset);
691 
692     b.setAccept(0, a.isAccept(0));
693     newstate.put(initialset, 0);
694 
695     // like Set<Integer,PointTransitions>
696     final PointTransitionSet points = new PointTransitionSet();
697 
698     // like SortedMap<Integer,Integer>
699     final SortedIntSet statesSet = new SortedIntSet(5);
700 
701     Transition t = new Transition();
702 
703     while (worklist.size() > 0) {
704       SortedIntSet.FrozenIntSet s = worklist.removeFirst();
705       //System.out.println("det: pop set=" + s);
706 
707       // Collate all outgoing transitions by min/1+max:
708       for(int i=0;i<s.values.length;i++) {
709         final int s0 = s.values[i];
710         int numTransitions = a.getNumTransitions(s0);
711         a.initTransition(s0, t);
712         for(int j=0;j<numTransitions;j++) {
713           a.getNextTransition(t);
714           points.add(t);
715         }
716       }
717 
718       if (points.count == 0) {
719         // No outgoing transitions -- skip it
720         continue;
721       }
722 
723       points.sort();
724 
725       int lastPoint = -1;
726       int accCount = 0;
727 
728       final int r = s.state;
729 
730       for(int i=0;i<points.count;i++) {
731 
732         final int point = points.points[i].point;
733 
734         if (statesSet.upto > 0) {
735           assert lastPoint != -1;
736 
737           statesSet.computeHash();
738           
739           Integer q = newstate.get(statesSet);
740           if (q == null) {
741             q = b.createState();
742             if (q >= maxDeterminizedStates) {
743               throw new TooComplexToDeterminizeException(a, maxDeterminizedStates);
744             }
745             final SortedIntSet.FrozenIntSet p = statesSet.freeze(q);
746             //System.out.println("  make new state=" + q + " -> " + p + " accCount=" + accCount);
747             worklist.add(p);
748             b.setAccept(q, accCount > 0);
749             newstate.put(p, q);
750           } else {
751             assert (accCount > 0 ? true:false) == b.isAccept(q): "accCount=" + accCount + " vs existing accept=" +
752               b.isAccept(q) + " states=" + statesSet;
753           }
754 
755           // System.out.println("  add trans src=" + r + " dest=" + q + " min=" + lastPoint + " max=" + (point-1));
756 
757           b.addTransition(r, q, lastPoint, point-1);
758         }
759 
760         // process transitions that end on this point
761         // (closes an overlapping interval)
762         int[] transitions = points.points[i].ends.transitions;
763         int limit = points.points[i].ends.next;
764         for(int j=0;j<limit;j+=3) {
765           int dest = transitions[j];
766           statesSet.decr(dest);
767           accCount -= a.isAccept(dest) ? 1:0;
768         }
769         points.points[i].ends.next = 0;
770 
771         // process transitions that start on this point
772         // (opens a new interval)
773         transitions = points.points[i].starts.transitions;
774         limit = points.points[i].starts.next;
775         for(int j=0;j<limit;j+=3) {
776           int dest = transitions[j];
777           statesSet.incr(dest);
778           accCount += a.isAccept(dest) ? 1:0;
779         }
780         lastPoint = point;
781         points.points[i].starts.next = 0;
782       }
783       points.reset();
784       assert statesSet.upto == 0: "upto=" + statesSet.upto;
785     }
786 
787     Automaton result = b.finish();
788     assert result.isDeterministic();
789     return result;
790   }
791 
792   /**
793    * Returns true if the given automaton accepts no strings.
794    */
795   public static boolean isEmpty(Automaton a) {
796     if (a.getNumStates() == 0) {
797       // Common case: no states
798       return true;
799     }
800     if (a.isAccept(0) == false && a.getNumTransitions(0) == 0) {
801       // Common case: just one initial state
802       return true;
803     }
804     if (a.isAccept(0) == true) {
805       // Apparently common case: it accepts the damned empty string
806       return false;
807     }
808     
809     LinkedList<Integer> workList = new LinkedList<>();
810     BitSet seen = new BitSet(a.getNumStates());
811     workList.add(0);
812     seen.set(0);
813 
814     Transition t = new Transition();
815     while (workList.isEmpty() == false) {
816       int state = workList.removeFirst();
817       if (a.isAccept(state)) {
818         return false;
819       }
820       int count = a.initTransition(state, t);
821       for(int i=0;i<count;i++) {
822         a.getNextTransition(t);
823         if (seen.get(t.dest) == false) {
824           workList.add(t.dest);
825           seen.set(t.dest);
826         }
827       }
828     }
829 
830     return true;
831   }
832   
833   /**
834    * Returns true if the given automaton accepts all strings.  The automaton must be minimized.
835    */
836   public static boolean isTotal(Automaton a) {
837     return isTotal(a, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
838   }
839 
840   /**
841    * Returns true if the given automaton accepts all strings for the specified min/max
842    * range of the alphabet.  The automaton must be minimized.
843    */
844   public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) {
845     if (a.isAccept(0) && a.getNumTransitions(0) == 1) {
846       Transition t = new Transition();
847       a.getTransition(0, 0, t);
848       return t.dest == 0
849         && t.min == minAlphabet
850         && t.max == maxAlphabet;
851     }
852     return false;
853   }
854   
855   /**
856    * Returns true if the given string is accepted by the automaton.  The input must be deterministic.
857    * <p>
858    * Complexity: linear in the length of the string.
859    * <p>
860    * <b>Note:</b> for full performance, use the {@link RunAutomaton} class.
861    */
862   public static boolean run(Automaton a, String s) {
863     assert a.isDeterministic();
864     int state = 0;
865     for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
866       int nextState = a.step(state, cp = s.codePointAt(i));
867       if (nextState == -1) {
868         return false;
869       }
870       state = nextState;
871     }
872     return a.isAccept(state);
873   }
874 
875   /**
876    * Returns true if the given string (expressed as unicode codepoints) is accepted by the automaton.  The input must be deterministic.
877    * <p>
878    * Complexity: linear in the length of the string.
879    * <p>
880    * <b>Note:</b> for full performance, use the {@link RunAutomaton} class.
881    */
882   public static boolean run(Automaton a, IntsRef s) {
883     assert a.isDeterministic();
884     int state = 0;
885     for (int i=0;i<s.length;i++) {
886       int nextState = a.step(state, s.ints[s.offset+i]);
887       if (nextState == -1) {
888         return false;
889       }
890       state = nextState;
891     }
892     return a.isAccept(state);
893   }
894 
895   /**
896    * Returns the set of live states. A state is "live" if an accept state is
897    * reachable from it and if it is reachable from the initial state.
898    */
899   private static BitSet getLiveStates(Automaton a) {
900     BitSet live = getLiveStatesFromInitial(a);
901     live.and(getLiveStatesToAccept(a));
902     return live;
903   }
904 
905   /** Returns bitset marking states reachable from the initial state. */
906   private static BitSet getLiveStatesFromInitial(Automaton a) {
907     int numStates = a.getNumStates();
908     BitSet live = new BitSet(numStates);
909     if (numStates == 0) {
910       return live;
911     }
912     LinkedList<Integer> workList = new LinkedList<>();
913     live.set(0);
914     workList.add(0);
915 
916     Transition t = new Transition();
917     while (workList.isEmpty() == false) {
918       int s = workList.removeFirst();
919       int count = a.initTransition(s, t);
920       for(int i=0;i<count;i++) {
921         a.getNextTransition(t);
922         if (live.get(t.dest) == false) {
923           live.set(t.dest);
924           workList.add(t.dest);
925         }
926       }
927     }
928 
929     return live;
930   }
931 
932   /** Returns bitset marking states that can reach an accept state. */
933   private static BitSet getLiveStatesToAccept(Automaton a) {
934     Automaton.Builder builder = new Automaton.Builder();
935 
936     // NOTE: not quite the same thing as what SpecialOperations.reverse does:
937     Transition t = new Transition();
938     int numStates = a.getNumStates();
939     for(int s=0;s<numStates;s++) {
940       builder.createState();
941     }
942     for(int s=0;s<numStates;s++) {
943       int count = a.initTransition(s, t);
944       for(int i=0;i<count;i++) {
945         a.getNextTransition(t);
946         builder.addTransition(t.dest, s, t.min, t.max);
947       }
948     }
949     Automaton a2 = builder.finish();
950 
951     LinkedList<Integer> workList = new LinkedList<>();
952     BitSet live = new BitSet(numStates);
953     BitSet acceptBits = a.getAcceptStates();
954     int s = 0;
955     while (s < numStates && (s = acceptBits.nextSetBit(s)) != -1) {
956       live.set(s);
957       workList.add(s);
958       s++;
959     }
960 
961     while (workList.isEmpty() == false) {
962       s = workList.removeFirst();
963       int count = a2.initTransition(s, t);
964       for(int i=0;i<count;i++) {
965         a2.getNextTransition(t);
966         if (live.get(t.dest) == false) {
967           live.set(t.dest);
968           workList.add(t.dest);
969         }
970       }
971     }
972 
973     return live;
974   }
975 
976   /**
977    * Removes transitions to dead states (a state is "dead" if it is not
978    * reachable from the initial state or no accept state is reachable from it.)
979    */
980   public static Automaton removeDeadStates(Automaton a) {
981     int numStates = a.getNumStates();
982     BitSet liveSet = getLiveStates(a);
983 
984     int[] map = new int[numStates];
985 
986     Automaton result = new Automaton();
987     //System.out.println("liveSet: " + liveSet + " numStates=" + numStates);
988     for(int i=0;i<numStates;i++) {
989       if (liveSet.get(i)) {
990         map[i] = result.createState();
991         result.setAccept(map[i], a.isAccept(i));
992       }
993     }
994 
995     Transition t = new Transition();
996 
997     for (int i=0;i<numStates;i++) {
998       if (liveSet.get(i)) {
999         int numTransitions = a.initTransition(i, t);
1000         // filter out transitions to dead states:
1001         for(int j=0;j<numTransitions;j++) {
1002           a.getNextTransition(t);
1003           if (liveSet.get(t.dest)) {
1004             result.addTransition(map[i], map[t.dest], t.min, t.max);
1005           }
1006         }
1007       }
1008     }
1009 
1010     result.finishState();
1011     assert hasDeadStates(result) == false;
1012     return result;
1013   }
1014 
1015   /**
1016    * Finds the largest entry whose value is less than or equal to c, or 0 if
1017    * there is no such entry.
1018    */
1019   static int findIndex(int c, int[] points) {
1020     int a = 0;
1021     int b = points.length;
1022     while (b - a > 1) {
1023       int d = (a + b) >>> 1;
1024       if (points[d] > c) b = d;
1025       else if (points[d] < c) a = d;
1026       else return d;
1027     }
1028     return a;
1029   }
1030   
1031   /**
1032    * Returns true if the language of this automaton is finite.  The
1033    * automaton must not have any dead states.
1034    */
1035   public static boolean isFinite(Automaton a) {
1036     if (a.getNumStates() == 0) {
1037       return true;
1038     }
1039     return isFinite(new Transition(), a, 0, new BitSet(a.getNumStates()), new BitSet(a.getNumStates()));
1040   }
1041   
1042   /**
1043    * Checks whether there is a loop containing state. (This is sufficient since
1044    * there are never transitions to dead states.)
1045    */
1046   // TODO: not great that this is recursive... in theory a
1047   // large automata could exceed java's stack
1048   private static boolean isFinite(Transition scratch, Automaton a, int state, BitSet path, BitSet visited) {
1049     path.set(state);
1050     int numTransitions = a.initTransition(state, scratch);
1051     for(int t=0;t<numTransitions;t++) {
1052       a.getTransition(state, t, scratch);
1053       if (path.get(scratch.dest) || (!visited.get(scratch.dest) && !isFinite(scratch, a, scratch.dest, path, visited))) {
1054         return false;
1055       }
1056     }
1057     path.clear(state);
1058     visited.set(state);
1059     return true;
1060   }
1061   
1062   /**
1063    * Returns the longest string that is a prefix of all accepted strings and
1064    * visits each state at most once.  The automaton must be deterministic.
1065    * 
1066    * @return common prefix, which can be an empty (length 0) String (never null)
1067    */
1068   public static String getCommonPrefix(Automaton a) {
1069     if (a.isDeterministic() == false) {
1070       throw new IllegalArgumentException("input automaton must be deterministic");
1071     }
1072     StringBuilder b = new StringBuilder();
1073     HashSet<Integer> visited = new HashSet<>();
1074     int s = 0;
1075     boolean done;
1076     Transition t = new Transition();
1077     do {
1078       done = true;
1079       visited.add(s);
1080       if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) {
1081         a.getTransition(s, 0, t);
1082         if (t.min == t.max && !visited.contains(t.dest)) {
1083           b.appendCodePoint(t.min);
1084           s = t.dest;
1085           done = false;
1086         }
1087       }
1088     } while (!done);
1089 
1090     return b.toString();
1091   }
1092   
1093   // TODO: this currently requites a determinized machine,
1094   // but it need not -- we can speed it up by walking the
1095   // NFA instead.  it'd still be fail fast.
1096   /**
1097    * Returns the longest BytesRef that is a prefix of all accepted strings and
1098    * visits each state at most once.  The automaton must be deterministic.
1099    * 
1100    * @return common prefix, which can be an empty (length 0) BytesRef (never null)
1101    */
1102   public static BytesRef getCommonPrefixBytesRef(Automaton a) {
1103     BytesRefBuilder builder = new BytesRefBuilder();
1104     HashSet<Integer> visited = new HashSet<>();
1105     int s = 0;
1106     boolean done;
1107     Transition t = new Transition();
1108     do {
1109       done = true;
1110       visited.add(s);
1111       if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) {
1112         a.getTransition(s, 0, t);
1113         if (t.min == t.max && !visited.contains(t.dest)) {
1114           builder.append((byte) t.min);
1115           s = t.dest;
1116           done = false;
1117         }
1118       }
1119     } while (!done);
1120 
1121     return builder.get();
1122   }
1123 
1124   /** If this automaton accepts a single input, return it.  Else, return null.
1125    *  The automaton must be deterministic. */
1126   public static IntsRef getSingleton(Automaton a) {
1127     if (a.isDeterministic() == false) {
1128       throw new IllegalArgumentException("input automaton must be deterministic");
1129     }
1130     IntsRefBuilder builder = new IntsRefBuilder();
1131     HashSet<Integer> visited = new HashSet<>();
1132     int s = 0;
1133     Transition t = new Transition();
1134     while (true) {
1135       visited.add(s);
1136       if (a.isAccept(s) == false) {
1137         if (a.getNumTransitions(s) == 1) {
1138           a.getTransition(s, 0, t);
1139           if (t.min == t.max && !visited.contains(t.dest)) {
1140             builder.append(t.min);
1141             s = t.dest;
1142             continue;
1143           }
1144         }
1145       } else if (a.getNumTransitions(s) == 0) {
1146         return builder.get();
1147       }
1148 
1149       // Automaton accepts more than one string:
1150       return null;
1151     }
1152   }
1153 
1154   /**
1155    * Returns the longest BytesRef that is a suffix of all accepted strings.
1156    * Worst case complexity: exponential in number of states (this calls
1157    * determinize).
1158    * @param maxDeterminizedStates maximum number of states determinizing the
1159    *  automaton can result in.  Set higher to allow more complex queries and
1160    *  lower to prevent memory exhaustion.
1161    * @return common suffix, which can be an empty (length 0) BytesRef (never null)
1162    */
1163   public static BytesRef getCommonSuffixBytesRef(Automaton a, int maxDeterminizedStates) {
1164     // reverse the language of the automaton, then reverse its common prefix.
1165     Automaton r = Operations.determinize(reverse(a), maxDeterminizedStates);
1166     BytesRef ref = getCommonPrefixBytesRef(r);
1167     reverseBytes(ref);
1168     return ref;
1169   }
1170   
1171   private static void reverseBytes(BytesRef ref) {
1172     if (ref.length <= 1) return;
1173     int num = ref.length >> 1;
1174     for (int i = ref.offset; i < ( ref.offset + num ); i++) {
1175       byte b = ref.bytes[i];
1176       ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1];
1177       ref.bytes[ref.offset * 2 + ref.length - i - 1] = b;
1178     }
1179   }
1180 
1181   /** Returns an automaton accepting the reverse language. */
1182   public static Automaton reverse(Automaton a) {
1183     return reverse(a, null);
1184   }
1185 
1186   /** Reverses the automaton, returning the new initial states. */
1187   static Automaton reverse(Automaton a, Set<Integer> initialStates) {
1188 
1189     if (Operations.isEmpty(a)) {
1190       return new Automaton();
1191     }
1192 
1193     int numStates = a.getNumStates();
1194 
1195     // Build a new automaton with all edges reversed
1196     Automaton.Builder builder = new Automaton.Builder();
1197 
1198     // Initial node; we'll add epsilon transitions in the end:
1199     builder.createState();
1200 
1201     for(int s=0;s<numStates;s++) {
1202       builder.createState();
1203     }
1204 
1205     // Old initial state becomes new accept state:
1206     builder.setAccept(1, true);
1207 
1208     Transition t = new Transition();
1209     for (int s=0;s<numStates;s++) {
1210       int numTransitions = a.getNumTransitions(s);
1211       a.initTransition(s, t);
1212       for(int i=0;i<numTransitions;i++) {
1213         a.getNextTransition(t);
1214         builder.addTransition(t.dest+1, s+1, t.min, t.max);
1215       }
1216     }
1217 
1218     Automaton result = builder.finish();
1219 
1220     int s = 0;
1221     BitSet acceptStates = a.getAcceptStates();
1222     while (s < numStates && (s = acceptStates.nextSetBit(s)) != -1) {
1223       result.addEpsilon(0, s+1);
1224       if (initialStates != null) {
1225         initialStates.add(s+1);
1226       }
1227       s++;
1228     }
1229 
1230     result.finishState();
1231 
1232     return result;
1233   }
1234 
1235   /** Returns a new automaton accepting the same language with added
1236    *  transitions to a dead state so that from every state and every label
1237    *  there is a transition. */
1238   static Automaton totalize(Automaton a) {
1239     Automaton result = new Automaton();
1240     int numStates = a.getNumStates();
1241     for(int i=0;i<numStates;i++) {
1242       result.createState();
1243       result.setAccept(i, a.isAccept(i));
1244     }
1245 
1246     int deadState = result.createState();
1247     result.addTransition(deadState, deadState, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
1248 
1249     Transition t = new Transition();
1250     for(int i=0;i<numStates;i++) {
1251       int maxi = Character.MIN_CODE_POINT;
1252       int count = a.initTransition(i, t);
1253       for(int j=0;j<count;j++) {
1254         a.getNextTransition(t);
1255         result.addTransition(i, t.dest, t.min, t.max);
1256         if (t.min > maxi) {
1257           result.addTransition(i, deadState, maxi, t.min-1);
1258         }
1259         if (t.max + 1 > maxi) {
1260           maxi = t.max + 1;
1261         }
1262       }
1263 
1264       if (maxi <= Character.MAX_CODE_POINT) {
1265         result.addTransition(i, deadState, maxi, Character.MAX_CODE_POINT);
1266       }
1267     }
1268 
1269     result.finishState();
1270     return result;
1271   }
1272 
1273   /** Returns the topological sort of all states reachable from
1274    *  the initial state.  Behavior is undefined if this
1275    *  automaton has cycles.  CPU cost is O(numTransitions),
1276    *  and the implementation is recursive so an automaton
1277    *  matching long strings may exhaust the java stack. */
1278   public static int[] topoSortStates(Automaton a) {
1279     if (a.getNumStates() == 0) {
1280       return new int[0];
1281     }
1282     int numStates = a.getNumStates();
1283     int[] states = new int[numStates];
1284     final BitSet visited = new BitSet(numStates);
1285     int upto = topoSortStatesRecurse(a, visited, states, 0, 0);
1286 
1287     if (upto < states.length) {
1288       // There were dead states
1289       int[] newStates = new int[upto];
1290       System.arraycopy(states, 0, newStates, 0, upto);
1291       states = newStates;
1292     }
1293 
1294     // Reverse the order:
1295     for(int i=0;i<states.length/2;i++) {
1296       int s = states[i];
1297       states[i] = states[states.length-1-i];
1298       states[states.length-1-i] = s;
1299     }
1300 
1301     return states;
1302   }
1303 
1304   private static int topoSortStatesRecurse(Automaton a, BitSet visited, int[] states, int upto, int state) {
1305     Transition t = new Transition();
1306     int count = a.initTransition(state, t);
1307     for (int i=0;i<count;i++) {
1308       a.getNextTransition(t);
1309       if (!visited.get(t.dest)) {
1310         visited.set(t.dest);
1311         upto = topoSortStatesRecurse(a, visited, states, upto, t.dest);
1312       }
1313     }
1314     states[upto] = state;
1315     upto++;
1316     return upto;
1317   }
1318 }